First, exploring the best month to buy and sell a home in King County, Washington

I start by uploading the dataset and cleaning it.

library(readr)

kingcountysales <- read_csv("C:/Users/tjf4x/Desktop/R projects/King county home sales/kingcountysales_2000_2023.csv")

# Clean and transforming data

library(tidyverse)
library(lubridate)

glimpse(kingcountysales)
## Rows: 575,319
## Columns: 49
## $ ...1             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ sale_id          <chr> "1999..144", "1999..258", "1999..331", "1999..609", "…
## $ pinx             <chr> "..2734100475", "..1535200725", "..6028000255", "..12…
## $ sale_date        <chr> "1/5/1999", "1/5/1999", "1/4/1999", "1/11/1999", "1/7…
## $ sale_price       <dbl> 150000, 235000, 293000, 178506, 270000, 184250, 17500…
## $ sale_nbr         <dbl> 1, 1, 1, 1, 1, 1, 1, NA, 1, 1, 1, NA, 1, 1, 1, NA, 1,…
## $ sale_warning     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "34", NA,…
## $ join_status      <chr> "demo", "demo", "demo", "demo", "demo", "demo", "demo…
## $ join_year        <dbl> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999,…
## $ latitude         <dbl> 47.54436, 47.42125, 47.57210, 47.68557, 47.76448, 47.…
## $ longitude        <dbl> -122.3208, -122.4381, -122.1223, -122.1857, -122.2081…
## $ area             <dbl> 78, 100, 31, 74, 38, 8, 18, 49, 70, 81, 79, 55, 69, 2…
## $ city             <chr> "SEATTLE", "KING COUNTY", "KING COUNTY", "KIRKLAND", …
## $ zoning           <chr> "SF 5000", "RA2.5P", "R6", "RS8.5", "R15 OP", "SF 720…
## $ subdivision      <chr> "GEORGETOWN", "CHAUTAUQUA BEACH ADD", "NELSONS H E EA…
## $ present_use      <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ land_val         <dbl> 21000, 46000, 66000, 80000, 73000, 84000, 35000, 9100…
## $ imp_val          <dbl> 66000, 132000, 207000, 69000, 119000, 75000, 122000, …
## $ year_built       <dbl> 1900, 1916, 1986, 1961, 1924, 1920, 1955, 1986, 1967,…
## $ year_reno        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_lot         <dbl> 4000, 6695, 12360, 9450, 13600, 7200, 12554, 20300, 1…
## $ sqft             <dbl> 1410, 990, 2020, 880, 2050, 790, 2160, 2230, 560, 151…
## $ sqft_1           <dbl> 760, 990, 1470, 880, 1300, 790, 1440, 1240, 560, 750,…
## $ sqft_fbsmt       <dbl> 0, 0, 0, 0, 0, 0, 720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ grade            <dbl> 7, 6, 8, 6, 7, 7, 6, 9, 3, 7, 6, 5, 7, 8, 7, 7, 7, 6,…
## $ fbsmt_grade      <dbl> 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,…
## $ condition        <dbl> 3, 4, 3, 3, 5, 3, 3, 3, 3, 5, 3, 4, 3, 4, 5, 3, 3, 3,…
## $ stories          <dbl> 1.5, 1.0, 2.0, 1.0, 1.5, 1.0, 1.0, 1.5, 1.0, 2.0, 1.0…
## $ beds             <dbl> 3, 2, 3, 2, 5, 2, 4, 4, 1, 3, 3, 4, 4, 3, 3, 3, 3, 3,…
## $ bath_full        <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
## $ bath_3qtr        <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,…
## $ bath_half        <dbl> 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,…
## $ garb_sqft        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ gara_sqft        <dbl> 0, 0, 480, 0, 0, 0, 0, 730, 0, 0, 0, 0, 480, 580, 0, …
## $ wfnt             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ golf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ greenbelt        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ noise_traffic    <dbl> 2, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0,…
## $ view_rainier     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_olympics    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_cascades    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_territorial <dbl> 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_skyline     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_sound       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_lakewash    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_lakesamm    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_otherwater  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view_other       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ submarket        <chr> "E", "H", "R", "Q", "Q", "C", "F", "G", "N", "E", "E"…
# Removing duplicates

distinct(kingcountysales)
## # A tibble: 575,319 × 49
##     ...1 sale_id    pinx  sale_date sale_price sale_nbr sale_warning join_status
##    <dbl> <chr>      <chr> <chr>          <dbl>    <dbl> <chr>        <chr>      
##  1     1 1999..144  ..27… 1/5/1999      150000        1 <NA>         demo       
##  2     2 1999..258  ..15… 1/5/1999      235000        1 <NA>         demo       
##  3     3 1999..331  ..60… 1/4/1999      293000        1 <NA>         demo       
##  4     4 1999..609  ..12… 1/11/1999     178506        1 <NA>         demo       
##  5     5 1999..775  ..19… 1/7/1999      270000        1 <NA>         demo       
##  6     6 1999..957  ..52… 1/15/1999     184250        1 <NA>         demo       
##  7     7 1999..1155 ..34… 1/19/1999     175000        1 <NA>         demo       
##  8     8 1999..1267 ..06… 1/11/1999     359850       NA <NA>         demo       
##  9     9 1999..1335 ..73… 1/7/1999      105000        1 <NA>         demo       
## 10    10 1999..1354 ..33… 1/14/1999     160000        1 <NA>         demo       
## # ℹ 575,309 more rows
## # ℹ 41 more variables: join_year <dbl>, latitude <dbl>, longitude <dbl>,
## #   area <dbl>, city <chr>, zoning <chr>, subdivision <chr>, present_use <dbl>,
## #   land_val <dbl>, imp_val <dbl>, year_built <dbl>, year_reno <dbl>,
## #   sqft_lot <dbl>, sqft <dbl>, sqft_1 <dbl>, sqft_fbsmt <dbl>, grade <dbl>,
## #   fbsmt_grade <dbl>, condition <dbl>, stories <dbl>, beds <dbl>,
## #   bath_full <dbl>, bath_3qtr <dbl>, bath_half <dbl>, garb_sqft <dbl>, …

Selecting appropriate variables and converting variables.

I converted the cities into factors in case I wanted to do further analysis broken down by city.

# selecting variables and converting date

king_time <- kingcountysales %>% 
  select(sale_date, sale_price, city) %>% 
  mutate(date = mdy(sale_date))

# converting cities to factors

unique(king_time$city)
##  [1] "SEATTLE"          "KING COUNTY"      "KIRKLAND"         "BOTHELL"         
##  [5] "NORMANDY PARK"    "PACIFIC"          "ISSAQUAH"         "COVINGTON"       
##  [9] "KENT"             "BELLEVUE"         "SHORELINE"        "FEDERAL WAY"     
## [13] "REDMOND"          "BURIEN"           "MERCER ISLAND"    "LAKE FOREST PARK"
## [17] "KENMORE"          "AUBURN"           "HUNTS POINT"      "TUKWILA"         
## [21] "RENTON"           "MAPLE VALLEY"     "SEA-TAC"          "CLYDE HILL"      
## [25] "MEDINA"           "SNOQUALMIE"       "ENUMCLAW"         "NEWCASTLE"       
## [29] "DES MOINES"       "NORTH BEND"       "BLACK DIAMOND"    "WOODINVILLE"     
## [33] "DUVALL"           "CARNATION"        "ALGONA"           "YARROW POINT"    
## [37] "BEAUX ARTS"       "SKYKOMISH"        "MILTON"           "SAMMAMISH"       
## [41] "SeaTac"
king_time$city <- as.factor(king_time$city)


# Checking for missing values

sum(is.na(king_time))
## [1] 0

Calculating the monthly median home sale and creating a series of plots.

# Calculating median by month

library(timetk)

king_monthly <- 
  king_time %>% 
  summarize_by_time(.date_var = date,
                    .by = "month",
                    median_sale = median(sale_price, na.rm = TRUE))

head(king_monthly, 12)
## # A tibble: 12 × 2
##    date       median_sale
##    <date>           <dbl>
##  1 1999-01-01      222000
##  2 1999-02-01      225475
##  3 1999-03-01      223250
##  4 1999-04-01      230000
##  5 1999-05-01      226000
##  6 1999-06-01      235000
##  7 1999-07-01      235000
##  8 1999-08-01      235920
##  9 1999-09-01      235000
## 10 1999-10-01      229970
## 11 1999-11-01      236250
## 12 1999-12-01      233800
# Plotting monthly median sale price

plot_time_series(king_monthly, 
                 .date_var = date, 
                 .value = median_sale,
                 .interactive = TRUE,
                 .x_lab = "Monthy Data",
                 .y_lab = "Median Sale price")
# Plotting monthly sales to find best months to buy or sell homes in king county

plot_seasonal_diagnostics(king_monthly, .date_var = date, .value = median_sale)
# Best month to sell by median home price is June

# Best month to buy is January

I wanted to see the accuracy of a simple univariant forecast model of the median home price. I did this using the Fable package and compared a couple different options

# Building a simple univariant forecast model
# -------------------------------------------

# Creating testing and training split

library(rsample)

king_monthly_split <- initial_time_split(king_monthly_post2012, prop = 130/136)

king_training <- training(king_monthly_split)
king_testing <- testing(king_monthly_split)

# converting data to a tibble and setting index as date

library(fable)
library(tsibble)

king_training <- 
  king_training %>% 
  mutate(date = yearmonth(date)) %>% 
  as_tsibble(index = date)

king_testing <- 
  king_testing %>% 
  mutate(date = yearmonth(date)) %>% 
  as_tsibble(index = date)

king_monthly_post2012 <- 
  king_monthly_post2012 %>% 
  mutate(date = yearmonth(date)) %>% 
  as_tsibble(index = date)

# Train the models

library(feasts)

king_fit <- 
  king_training %>% 
  model(stepwise = ARIMA(median_sale),
        search = ARIMA(median_sale, stepwise=FALSE))

# Viewing model
tidy(king_fit)
## # A tibble: 6 × 6
##   .model   term     estimate std.error statistic  p.value
##   <chr>    <chr>       <dbl>     <dbl>     <dbl>    <dbl>
## 1 stepwise ar1         0.857    0.0488     17.5  3.70e-34
## 2 stepwise sma1       -0.789    0.0945     -8.35 1.82e-13
## 3 stepwise constant 7027.     584.         12.0  4.84e-22
## 4 search   ar1         0.857    0.0488     17.5  3.70e-34
## 5 search   sma1       -0.789    0.0945     -8.35 1.82e-13
## 6 search   constant 7027.     584.         12.0  4.84e-22
king_fit %>% 
  accuracy() %>% 
  arrange(MAPE)
## # A tibble: 2 × 10
##   .model   .type       ME   RMSE    MAE    MPE  MAPE  MASE RMSSE   ACF1
##   <chr>    <chr>    <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 stepwise Training -426. 18360. 13356. -0.188  1.98 0.251 0.283 0.0519
## 2 search   Training -426. 18360. 13356. -0.188  1.98 0.251 0.283 0.0519
# Checking forecast of training data vs full data set

king_fit %>% 
  forecast(h = "6 months") %>% 
  autoplot(king_monthly_post2012)

The model performed very well. Better than I expected given it is only using past median sales data without any external economic factors

I decided to train the model on the full dataset that ends in December of 2023 and see how it performs compared to current monthly median sales.

# final fit on full data set

king_final_fit <- 
  king_monthly_post2012 %>% 
  model(stepwise = ARIMA(median_sale),
        search = ARIMA(median_sale, stepwise=FALSE))

# Fitting final model on full data set

king_final_fit %>% 
  forecast(h = "12 months") %>% 
  autoplot(king_monthly_post2012) +
  labs(x = "Month", y = "median home price")

king_final_fit %>% 
  forecast(h = "12 months") %>% 
  autoplot() +
  labs(x = "Month", y = "median home price")

king_monthly_t <- 
  king_monthly %>% 
  mutate(date = yearmonth(date)) %>% 
  as_tsibble(index = date)

Compared to the latest data for April of 2024, the king county median home sale was $895,000. That is still in the 80% confidence interval but right on the low end. The model is being too optimistic. This is likely because the model has mostly only seen data since the recovery from the 2008 financial crisis.

I added the full data set back in (since 2002) and retrained the model.

# The data ends in December of 2023, but referencing median sales in recent
# we can see the forecast starts to get too optimistic. I added back in the housing 
# crisis data and retrained.

king_final_fit_full <- 
  king_monthly_t %>% 
  model(stepwise = ARIMA(median_sale),
        search = ARIMA(median_sale, stepwise=FALSE))

king_final_fit_full %>% 
  forecast(h = "12 months") %>% 
  autoplot(king_monthly_t) +
  labs(x = "Month", y = "median home price")

king_final_fit_full %>% 
  forecast(h = "12 months") %>% 
  autoplot() +
  labs(x = "Month", y = "median home price")

# Adding the full time series makes the model more conservative and makes the 
# forecast more accurate.

The model is now almost right on without any additional information. In the future I would like to add in some economic indicators to try and narrow the confidence interval.